#include "opencl_source_map.hpp" 
namespace MNN { 
const char* layernorm = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"#ifdef LOCAL_SIZE\n"
"__kernel void layernorm_w(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __read_only image2d_t input,\n"
" __write_only image2d_t output,\n"
" __private const int width,\n"
" __private const int height,\n"
" __private const int channel,\n"
"#ifdef GAMMA_BETA\n"
" __global const FLOAT *gamma,\n"
" __global const FLOAT *beta,\n"
"#endif\n"
" __private float epsilon){\n"
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
" float4 local sum_mnn[LOCAL_SIZE];\n"
" #ifndef RMSNORM\n"
" float4 local sum_mean_mnn[LOCAL_SIZE];\n"
" #endif\n"
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
" const int h=pos.y % height;\n"
" const int c=pos.y/height;\n"
" const int b=pos.z;\n"
" const int lid=get_local_id(0);\n"
" const int bh_offset=mad24(b,height,h);\n"
" float4 in_sum=0;\n"
"#ifdef RMSNORM\n"
" float4 mean=0;\n"
"#else\n"
" for(int i=lid; i<width; i+=LOCAL_SIZE){\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+i,bh_offset)));\n"
" in_sum += in;\n"
" }\n"
" sum_mean_mnn[lid]=in_sum;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+sum_mean_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" float4 mean=sum_mean_mnn[0]/(float4)width;\n"
"#endif\n"
" in_sum=0;\n"
" for(int i=lid; i<width; i+=LOCAL_SIZE){\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+i,bh_offset)));\n"
" in_sum += (in-mean)*(in-mean);\n"
" }\n"
" sum_mnn[lid]=in_sum;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" float4 square_sum=sum_mnn[0]/(float4)width;\n"
" float4 value=(float4)1.0f/(float4)sqrt(square_sum+(float4)epsilon);\n"
" for(int i=lid; i<width; i+=LOCAL_SIZE){\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+i,bh_offset)));\n"
"#ifdef GAMMA_BETA\n"
" float4 out=(in-mean)*value*(float4)gamma[i]+(float4)beta[i];\n"
"#else\n"
" float4 out=(in-mean)*value;\n"
"#endif\n"
" WI_F(output,(int2)(c*width+i,bh_offset),CONVERT_FLOAT4(out));\n"
" }\n"
" }\n"
"}\n"
"__kernel void layernorm_hw(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __read_only image2d_t input,\n"
" __write_only image2d_t output,\n"
" __private const int width,\n"
" __private const int height,\n"
" __private const int channel,\n"
"#ifdef GAMMA_BETA\n"
" __global const FLOAT *gamma,\n"
" __global const FLOAT *beta,\n"
"#endif\n"
" __private float epsilon){\n"
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
" float4 local sum_mnn[LOCAL_SIZE];\n"
" #ifndef RMSNORM\n"
" float4 local sum_mean_mnn[LOCAL_SIZE];\n"
" #endif\n"
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
" const int c=pos.y;\n"
" const int b=pos.z;\n"
" const int height_width=height*width;\n"
" const int lid=get_local_id(0);\n"
" float4 in_sum=0;\n"
"#ifdef RMSNORM\n"
" float4 mean=0;\n"
"#else\n"
" for(int i=lid; i<height_width; i+=LOCAL_SIZE){\n"
" int w=i % width;\n"
" int h=i/width;\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+w,b*height+h)));\n"
" in_sum += in;\n"
" }\n"
" sum_mean_mnn[lid]=in_sum;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+sum_mean_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" float4 mean=sum_mean_mnn[0]/(float4)height_width;\n"
"#endif\n"
" in_sum=0;\n"
" for(int i=lid; i<height_width; i+=LOCAL_SIZE){\n"
" int w=i % width;\n"
" int h=i/width;\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+w,b*height+h)));\n"
" in_sum += (in-mean)*(in-mean);\n"
" }\n"
" sum_mnn[lid]=in_sum;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" float4 square_sum=sum_mnn[0]/(float4)height_width;\n"
" float4 value=(float4)1.0f/(float4)sqrt(square_sum+(float4)epsilon);\n"
" for(int i=lid; i<height_width; i+=LOCAL_SIZE){\n"
" int w=i % width;\n"
" int h=i/width;\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+w,b*height+h)));\n"
"#ifdef GAMMA_BETA\n"
" float4 out=(in-mean)*value*(float4)gamma[i]+(float4)beta[i];\n"
"#else\n"
" float4 out=(in-mean)*value;\n"
"#endif\n"
" WI_F(output,(int2)(c*width+w,b*height+h),CONVERT_FLOAT4(out));\n"
" }\n"
" }\n"
"}\n"
"__kernel void layernorm_chw(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __read_only image2d_t input,\n"
" __write_only image2d_t output,\n"
" __private const int width,\n"
" __private const int height,\n"
" __private const int channel,\n"
"#ifdef GAMMA_BETA\n"
" __global const FLOAT *gamma,\n"
" __global const FLOAT *beta,\n"
"#endif\n"
" __private float epsilon){\n"
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
" float local sum_mnn[LOCAL_SIZE];\n"
" #ifndef RMSNORM\n"
" float4 local sum_mean_mnn[LOCAL_SIZE];\n"
" #endif\n"
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
" const int b=pos.z;\n"
" const int sum_size=width*height*channel;\n"
" const int reduce_size=width*height;\n"
" const int lid=get_local_id(0);\n"
" const int channel4=(channel+3)/4;\n"
" const int channel_remain=channel-(channel4-1)*4;\n"
" \n"
" float4 in_sum=0;\n"
" float4 in_sum_left=0;\n"
" float *in_sum_left_ptr=(float*)(&in_sum_left);\n"
"#ifdef RMSNORM\n"
" float4 mean=0;\n"
"#else\n"
" for(int c=0; c<channel4-1; ++c){\n"
" for(int i=lid; i<reduce_size; i+=LOCAL_SIZE){\n"
" int w=i % width;\n"
" int h=i/width;\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+w,b*height+h)));\n"
" in_sum += in;\n"
" }\n"
" }\n"
" for(int i=lid; i<reduce_size; i+=LOCAL_SIZE){\n"
" int w=i % width;\n"
" int h=i/width;\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)((channel4-1)*width+w,b*height+h)));\n"
" in_sum_left += in;\n"
" }\n"
" in_sum.x=in_sum.x+in_sum.y+in_sum.z+in_sum.w;\n"
" for(int i=1; i<channel_remain; ++i){\n"
" in_sum_left_ptr[0] += in_sum_left_ptr[i];\n"
" }\n"
" sum_mean_mnn[lid]=in_sum.x+in_sum_left.x;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+sum_mean_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" float4 mean=sum_mean_mnn[0]/(float4)sum_size;\n"
"#endif\n"
" in_sum=0;\n"
" in_sum_left=0;\n"
" for(int c=0; c<channel4-1; ++c){\n"
" for(int i=lid; i<reduce_size; i+=LOCAL_SIZE){\n"
" int w=i % width;\n"
" int h=i/width;\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+w,b*height+h)));\n"
" in_sum += (in-mean)*(in-mean);\n"
" }\n"
" }\n"
" \n"
" for(int i=lid; i<reduce_size; i+=LOCAL_SIZE){\n"
" int w=i % width;\n"
" int h=i/width;\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)((channel4-1)*width+w,b*height+h)));\n"
" in_sum_left += (in-mean)*(in-mean);\n"
" }\n"
" \n"
" in_sum.x=in_sum.x+in_sum.y+in_sum.z+in_sum.w;\n"
" for(int i=1; i<channel_remain; ++i){\n"
" in_sum_left_ptr[0] += in_sum_left_ptr[i];\n"
" }\n"
" \n"
" sum_mnn[lid]=in_sum.x+in_sum_left.x;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" float4 square_sum=sum_mnn[0]/(float4)sum_size;\n"
" float4 value=(float4)1.0f/(float4)sqrt(square_sum+(float4)epsilon);\n"
" for(int c=0; c<channel4; ++c){\n"
" for(int i=lid; i<reduce_size; i+=LOCAL_SIZE){\n"
" int w=i % width;\n"
" int h=i/width;\n"
" float4 in=convert_float4(RI_F(input,SAMPLER,(int2)(c*width+w,b*height+h)));\n"
"#ifdef GAMMA_BETA\n"
" float4 out=(in-mean)*value*(float4)gamma[c*reduce_size+i]+(float4)beta[c*reduce_size+i];\n"
"#else\n"
" float4 out=(in-mean)*value;\n"
"#endif\n"
" WI_F(output,(int2)(c*width+w,b*height+h),CONVERT_FLOAT4(out));\n"
" }\n"
" }\n"
" }\n"
"}\n"
"#endif\n"
;
}
